library(tidyverse)
library(ggpubr)
devtools::load_all()
set.seed(44)
library(RColorBrewer)

Simulate the data

ncells <- 20000
nmarkers <- 20
beta_size <- 3
sigma_alpha <- 2
sigma_epsilon <- 1

# Biology design matrix
X <- matrix(c(rep(0,ncells/2), rep(1,ncells/2)), ncells, 1)

# Global mean (similair overall shift to CyTOF data)
mu <- matrix(10, ncells, nmarkers)

# Coefficients of interest
beta <- matrix(0, 1, nmarkers)
beta[c(1,3,5)] <- rep(beta_size, 3)

# Unwanted variation design matrix
# Note: This is UNCORRELATED with X
W <- matrix(rbinom(ncells, 1, 0.5), ncells, 1)

# Coeffcients of unwanted variation
alpha <- matrix(rnorm(nmarkers, sd = sigma_alpha), 1, nmarkers)

# Random error
epsilon <- matrix(rnorm(ncells*nmarkers, sd = sigma_epsilon), ncells, nmarkers)

# RUVIII model
Y <-  mu + X %*% beta + W %*% alpha + epsilon

Annotate the simulation data

colnames(Y) <- paste0("Marker", 1:nmarkers)

sample_id <- rep(LETTERS[1:2], each = ncells/2)

cluster_id <- rep(NA, ncells) # Add nonsense to make correct format

data <- data.frame(sample = sample_id, cluster = cluster_id, Y)

data$cluster <- cluster_FlowSOM(data, 4)

Plot markers

plot_marker_boxplots(data)

plot_marker_boxplots_samp(data)

PCA plots

pca <- prcomp(as.matrix(data[, 3:ncol(data)]))

# Look at to what extent the pca vectors captures batch effect
plots <- list(qplot(1:ncells, pca$x[,1]) + labs(x = "Index", y = "PCA1") + theme_bw(),
              qplot(1:ncells, pca$x[,2]) + labs(x = "Index", y = "PCA2") + theme_bw(),
              qplot(1:ncells, pca$x[,3]) + labs(x = "Index", y = "PCA3") + theme_bw(),
              qplot(1:ncells, pca$x[,4]) + labs(x = "Index", y = "PCA4") + theme_bw())

ggarrange(plotlist = plots)

ggarrange(plotlist = plot_scpca_samp(data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)

ggarrange(plotlist = plot_scpca_clus(data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)

tSNE plots

tsne_data <- compute_tsne(data, 1000)
## Warning: no DISPLAY variable so Tk is not available
##   Running t-SNE...with seed 42  DONE
plot_tsne_sample(tsne_data)

plot_tsne_cluster(tsne_data)

#plot_tsne_marker(data, tsne_data)

Normalise the data

nrep_groups <- 2
rand <- sample(c(0,1), ncells, replace = TRUE)

# random pseudo replicates
M <- cbind(rand, !rand)
# exactly wrong psuedo replicates
#M <- cbind(W, !W)
# perfect psedo replicates
#M <- cbind(X, !X)

# Check ruv documentation condition satisfied
any(rowSums(M) == 0)
## [1] FALSE
# Standardise the cell data
# Could refactor RUVIII wrappers to make this easier
raw_Y <- as.matrix(data[ ,3:ncol(data)])

col_means <- colMeans(raw_Y)
col_sds <- apply(raw_Y, 2, function(x) sd(x))

for(i in 1:ncol(raw_Y)){
  raw_Y[,i] <- (raw_Y[,i] - col_means[i])/col_sds[i]
}

exclude_neg_controls <- c(1,3,5)
controls <- c(1:nmarkers)[-exclude_neg_controls]

RUVIIout <- fastRUVIII(Y = Y, M, ctl = controls, k = 1)
norm_Y <- RUVIIout$newY
alpha <- RUVIIout$fullalpha
  
for(i in 1:ncol(norm_Y)){
  norm_Y[,i] <- norm_Y[,i]*col_sds[i] + col_means[i]
}

norm_data <- data
norm_data[3:ncol(norm_data)] <- norm_Y
#pheatmap::pheatmap(alpha, cluster_cols = FALSE, cluster_rows = FALSE)
#RMY <- Y - M %*% solve(t(M) %*% M) %*% t(M) %*% Y
#pca <- prcomp(RMY)$x
#qplot(pca[,1], pca[,2]) + labs(x = "PCA1", y = "PCA2") + theme_bw()

Plot markers

plot_marker_boxplots(norm_data)

plot_marker_boxplots_samp(norm_data)

PCA plots

ggarrange(plotlist = plot_scpca_samp(norm_data, 2000), nrow = 1, ncol = 3, common.legend = TRUE)

tSNE plots

tsne_data <- compute_tsne(norm_data, 1000)
##   Running t-SNE...with seed 42  DONE
plot_tsne_sample(tsne_data)

plot_tsne_cluster(tsne_data)